# general visualisation
library('ggplot2') # visualisation
library('scales') # visualisation
library('patchwork') # visualisation
library('RColorBrewer') # visualisation
library('corrplot') # visualisation

# general data manipulation
library('dplyr') # data manipulation
library('readr') # input/output
library('vroom') # input/output
library('skimr') # overview
library('tibble') # data wrangling
library('tidyr') # data wrangling
library('purrr') # data wrangling
library('stringr') # string manipulation
library('forcats') # factor manipulation

# specific visualisation
library('alluvial') # visualisation
library('ggrepel') # visualisation
library('ggforce') # visualisation
library('ggridges') # visualisation
library('gganimate') # animations
library('GGally') # visualisation
library('ggthemes') # visualisation
library('wesanderson') # visualisation
library('kableExtra') # display

# Date + forecast
library('lubridate') # date and time
library('forecast') # time series analysis
library('prophet') # time series analysis
library('timetk') # time series analysis

# Interactivity
library('crosstalk')
library('plotly')

# parallel
library('foreach')
library('doParallel')

# Exact binomial confidence interval, returned as a named list.
#
# x: number of successes, passed unchanged to binom.test().
# n: number of trials, passed unchanged to binom.test().
#
# Returns a list with elements `lwr` and `upr` holding the lower and upper
# bound of the interval reported by binom.test() (at its default level).
get_binCI <- function(x, n) {
  bounds <- binom.test(x, n)$conf.int
  names(bounds) <- c("lwr", "upr")
  as.list(bounds)
}
# mgcv is attached here, after the packages above; it is left in place so the
# package masking order of the script stays as it was.
library(mgcv)

# Read the comma-separated input tables. `col_types = cols()` lets vroom guess
# every column type.
clean_pages <- vroom("clean_pages.csv", delim = ",", col_types = cols())
session_info <- vroom("sess.csv", delim = ",", col_types = cols())
signals <- vroom("signals.csv", delim = ",", col_types = cols())
clean_signals <- vroom("csignals.csv", delim = ",", col_types = cols())

# Fix the random number generator state for everything that follows.
set.seed(4321)

# Keep only the user id and the event timestamp.
clean_pages <- select(clean_pages, userId, time)

# One colour per distinct user: the 7-colour "Set2" palette is recycled when
# there are more users than colours. Note that this tibble is named `cols`,
# like the readr/vroom function; calls to `cols()` still resolve to the
# function because R skips non-function bindings when looking up a call.
cols <- clean_pages %>%
  distinct(userId) %>%
  mutate(cols = rep_len(brewer.pal(7, "Set2"), length.out = n()))

# Attach each user's colour and fold every timestamp onto a single day: the
# value modulo 86400 is the offset within a day, so all events land on
# 1970-01-01 (GMT). Assumes `time` holds epoch seconds — TODO confirm.
ts_out <- left_join(clean_pages, cols, by = "userId") %>%
  mutate(
    time = as.POSIXct(
      as.numeric(time) %% 86400,
      origin = "1970-01-01",
      tz = "GMT"
    )
  )

# Named colour vector (names are the user ids) for manual ggplot scales.
pal <- setNames(cols$cols, cols$userId)

# Wrap the data so linked widgets can share highlighting state.
shared_ts <- highlight_key(ts_out)

# Set the base-graphics palette to "Set3". brewer.pal() offers at most 12
# colours for this palette; the former request for 100 only produced a
# warning and the same 12 colours. This call does not affect the ggplot
# below, which takes its colours from `pal`.
palette(brewer.pal(12, "Set3"))

# Histogram of event times over the day, stacked and coloured per user.
# The plot maps the `fill` aesthetic, so the per-user colours in `pal` must be
# registered on the fill scale (a colour scale would be silently ignored).
# `fill` is mapped as character so the manual (discrete) scale also works when
# `userId` is numeric; the values then match the names of `pal`, which
# setNames() stored as character. The legend title is pinned to "userId".
gg <- shared_ts %>%
  ggplot(aes(time, fill = as.character(userId), group = userId)) +
  geom_histogram(bins = 60) +
  scale_fill_manual(values = pal) +
  labs(x = "Time", y = "Count", fill = "userId") +
  theme_tufte()

# Put the interactive histogram in a single full-width (12/12) column.
# bscols() expects at most one width per widget; supplying two widths for one
# widget only triggered a "Too many widths" warning.
filter2 <- bscols(
  ggplotly(gg, dynamicTicks = TRUE),
  widths = 12
)

# `filter2` is already a bscols() layout, so it is displayed directly instead
# of being wrapped in a second, redundant bscols() container.
filter2
# Number the rows within each (userId, word) group in their current order:
# `count` runs 1, 2, ... per group. The result is left grouped by
# userId and word.
signal_with_id <- clean_signals %>%
  group_by(userId, word) %>%
  mutate(count = row_number())

# Horizontal bar chart of the share of rows per `pos` value.
#
# data:  a data frame with a `pos` column.
# title: plot title.
#
# Returns a ggplot object; bars are ordered by count and the y axis is
# formatted as percentages.
plot_pos_share <- function(data, title) {
  data %>%
    count(pos) %>%
    add_tally(n, name = "total") %>%
    mutate(perc = n / total) %>%
    ggplot(aes(reorder(pos, n, FUN = min), perc, fill = pos)) +
    geom_col() +
    scale_y_continuous(labels = scales::percent) +
    coord_flip() +
    theme_hc() +
    theme(legend.position = "none") +
    labs(x = "", y = "", title = title)
}

# Share of each `pos` value over all signals ...
p <- plot_pos_share(signals, "Original")

# ... and over the rows where `signal` is 0 only.
p2 <- signals %>%
  filter(signal == 0) %>%
  plot_pos_share("Unknown")

# patchwork design string: both plots side by side with equal width.
layout <- "
AAABBB
"

p + p2 + plot_layout(design = layout)

# Time gap and signal transition between consecutive rows of the same
# (userId, word) group.
# NOTE(review): lag() follows the current row order within each group; this
# presumably relies on the rows already being sorted by time — confirm.
diff_df <- signal_with_id %>%
  group_by(userId, word) %>%
  mutate(
    diff = time - lag(time),
    tran = paste(lag(signal), "->", signal)
  ) %>%
  ungroup() %>%
  # Divide by the number of seconds in a day; assumes `time` is measured in
  # seconds, so `diff` ends up in days — TODO confirm.
  mutate(diff = diff / (24 * 60 * 60)) %>%
  # Keep gaps below 15 that are not exactly zero. Rows with a missing gap
  # (the first row of every group) are dropped, since filter() discards NA.
  filter(diff < 15, diff != 0) %>%
  select(word, diff, signal, tran)

# Empirical CDF of the wait time, one point series per transition type.
# The series are distinguished by the `color` aesthetic, so the blank legend
# title has to be set on the colour scale (a fill scale has no effect here).
p <- diff_df %>%
  ggplot(aes(x = diff, color = tran)) +
  stat_ecdf(geom = "point", size = 0.5) +
  theme_hc() +
  labs(x = "Days", y = "", title = "Wait Time CDF by Transition Type") +
  scale_color_discrete("")

# Share of each transition type among the retained rows.
p2 <- diff_df %>%
  count(tran) %>%
  add_tally(n, name = "total") %>%
  mutate(perc = n / total) %>%
  ggplot(aes(reorder(tran, n, FUN = min), perc, fill = tran)) +
  geom_col() +
  scale_y_continuous(labels = scales::percent) +
  coord_flip() +
  theme_hc() +
  theme(legend.position = "none") +
  labs(x = "", y = "", title = "Transition Type")

# Overall empirical CDF of the wait time, drawn as a filled area.
# `show.legend` is a layer argument, not an aesthetic, so it is passed to the
# layer; after_stat(y) replaces the deprecated `..y..` notation.
p3 <- diff_df %>%
  ggplot(aes(x = diff, fill = "cdf")) +
  stat_ecdf(
    aes(ymin = 0, ymax = after_stat(y)),
    geom = "ribbon",
    show.legend = FALSE
  ) +
  theme_hc() +
  theme(legend.position = "none") +
  labs(x = "Days", y = "", title = "Wait Time CDF")

# patchwork design: A (p3) top left, C (p2) top right, B (p) across the
# bottom three rows. Letters are assigned to plots in the order they are added.
layout <- "
AAACC
BBBBB
BBBBB
BBBBB
"

p3 + p + p2 + plot_layout(design = layout)